library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer) ; library(ggpubr)
library(biomaRt)
library(polycor)
library(caret) ; library(ROCR) ; library(car) ; library(MLmetrics)
library(corrplot)
library(expss) ; library(knitr) ; library(kableExtra)
library(foreach) ; library(doParallel)

# Return the SFARI gene-score colour palette (hex codes) at the requested
# positions. `r` is a vector of indices into the 10-colour palette
# (scores 1-6, then neutral greys used for non-SFARI categories).
SFARI_colour_hue = function(r) {
  # Return the subset directly: the original returned the invisible value of
  # an assignment, so calling the function at top level printed nothing
  c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}


Load Dataset


# Module assignments: pick the clustering algorithm to use and extract its
# module labels into a generic Module column
clustering_selected = 'DynamicHybrid'
clusterings = read_csv('./../Data/clusters.csv')
clusterings$Module = clusterings[,clustering_selected] %>% data.frame %>% unlist %>% unname
assigned_module = clusterings %>% dplyr::select(ID, Module)

# Dataset created with 20_04_07_create_dataset.html
# Load the WGCNA feature dataset, attach the module labels, recode the
# 'Others' gene score to 'None', and drop the clustering-specific column
# (its information now lives in Module)
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)
rownames_dataset = rownames(dataset)
dataset = dataset %>% mutate(Module = clusterings$Module, gene.score = as.character(gene.score)) %>%
                      mutate(gene.score = ifelse(gene.score=='Others', 'None', gene.score)) %>%
          dplyr::select(-matches(clustering_selected))
# mutate() drops rownames, so restore the gene IDs
rownames(dataset) = rownames_dataset

# Fix any Gene Significance that is NA
GS_missing = rownames(dataset)[is.na(dataset$GS)]
if(length(GS_missing)>0){
  # Gandal dataset
  # Recompute GS as the polyserial correlation between the gene's expression
  # and Diagnosis (presumably how GS was defined upstream — confirm against
  # the dataset-creation script)
  load('./../Data/preprocessed_data.RData')
  datExpr = datExpr %>% data.frame
  rownames(datExpr) = datGenes$ensembl_gene_id
  for(g in GS_missing){
    dataset$GS[rownames(dataset) == g] = polyserial(as.numeric(datExpr[g,]), datMeta$Diagnosis)
  }
}



# SFARI Genes: keep a single row per gene, dropping entries without an
# Ensembl ID (first occurrence of each ID wins, as before)
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv') %>%
              filter(!is.na(ID)) %>%
              distinct(ID, .keep_all = TRUE)


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and
# label the genes that make a match as neuronal (one row per gene ID)
GO_neuronal = GO_annotations %>%
              filter(grepl('neuron', go_term)) %>%
              transmute(ID = as.character(ensembl_gene_id)) %>%
              distinct %>%
              mutate(Neuronal = 1)


# Add gene symbol
# Query the Ensembl archive matching Gencode v19 for the symbol of every
# gene in the dataset (network call; requires the archive host to be up)
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)


# Clean up intermediates (note: g and GS_missing only exist if the GS-fix
# branch above ran; rm() warns otherwise)
rm(getinfo, mart, rownames_dataset, GO_annotations, g, GS_missing)


The features that will be considered for the classification model will be the ones WGCNA uses to identify significant modules and genes:



Filtering the 2188 genes that were not assigned to any cluster (represented as the gray cluster)

# The genes without a Module-Trait correlation are the unclassified ones
# (kept here only as a sanity check of which cluster they belong to)
rm_cluster = dataset[is.na(dataset$MTcor),'Module'] %>% unique %>% as.character

# Keep classified genes only, drop p-value / module-label columns, and add
# the absolute Gene Significance plus the binary SFARI objective variable
keep_rows = dataset$Module != 'gray' & !is.na(dataset$MTcor)
new_dataset = dataset %>% filter(Module != 'gray' & !is.na(MTcor)) %>% 
              dplyr::select(-c(matches(paste('pval|Module')), MMgray)) %>%
              mutate('absGS'=abs(GS), 'SFARI'=gene.score!='None') %>% dplyr::select(-gene.score)
# Bug fix: subset the rownames with the SAME condition used in the filter
# above — the original used only Module != 'gray', which silently misaligns
# rownames if any non-gray gene has a missing MTcor
rownames(new_dataset) = rownames(dataset)[keep_rows]


rm(rm_cluster, keep_rows)


Summary of the changes made to the original WGCNA variables:


  • Using Module Membership variables instead of binary module membership

  • Including a new variable with the absolute value of GS

  • Removing genes assigned to the gray module (unclassified genes)

  • Adding the Objective variable: Binary label indicating if it’s in the SFARI dataset or not

# Preview the first 5 genes, transposed so features become rows
table_info = t(head(new_dataset, 5))

kable(table_info,
      caption = '(Transposed) features and their values for the first rows of dataset',
      col.names = colnames(table_info)) %>%
  kable_styling(full_width = F)
(Transposed) features and their values for the first rows of dataset
ENSG00000121410 ENSG00000156006 ENSG00000077616 ENSG00000099974 ENSG00000168060
MTcor 0.2573447 -0.6545644 0.1211137 -0.4513463 0.2116993
GS 0.1183599 0.1206639 0.1354277 -0.3896922 0.0841230
MM.00BCD9 0.0644115 -0.1169676 -0.2041374 -0.3904832 0.0364040
MM.00BA43 0.0486077 -0.2591544 -0.0991941 -0.3922393 0.0729529
MM.FF6A98 0.3057779 -0.0781020 -0.2580772 -0.1292127 -0.0291675
MM.00BC56 0.3111117 -0.0124827 -0.0573097 -0.4253966 -0.1500037
MM.F77868 0.2839652 -0.1604029 0.1011445 -0.5028896 0.0522059
MM.FF67A5 0.4713036 -0.1391043 -0.0590534 -0.4511605 -0.1685158
MM.00BDD3 -0.4393538 0.0568088 0.2853187 -0.0804350 0.1579245
MM.00C19D -0.1685511 -0.0432363 -0.0513851 -0.0987818 0.3341099
MM.4BB400 -0.1722485 0.0627343 0.1155019 -0.1726204 0.2902171
MM.92AA00 -0.1754884 0.1396419 -0.0140442 0.0518021 0.2567167
MM.C17FFF 0.3807888 -0.0080560 0.1446017 -0.2388879 0.2036230
MM.FA7377 0.5734399 -0.0775294 -0.0965566 0.0239152 -0.1815071
MM.F8766D 0.1919877 -0.1340837 -0.0213147 -0.2324723 0.1943054
MM.75AF00 0.2059054 -0.0087001 0.1305954 -0.2723498 0.0340117
MM.8EAB00 0.2176658 -0.1073831 0.0394892 -0.1828077 0.0484666
MM.5CB300 0.2663801 -0.0306979 0.0882588 -0.0901384 0.1276368
MM.B783FF 0.2147886 0.0112303 0.0715717 0.0108214 0.1217851
MM.7B97FF 0.4652016 0.0664160 -0.0277690 -0.0867307 -0.0781665
MM.ADA200 0.3379989 -0.1030886 0.0811644 -0.2019362 0.1305911
MM.C67DFF 0.0903324 -0.1455558 0.0896199 -0.3020723 0.3105090
MM.00BECD -0.0296735 -0.0853622 -0.1282270 0.4553441 0.1000229
MM.08B704 0.2554295 -0.2081957 -0.1680037 0.1980896 -0.1221893
MM.E96AF1 0.2779718 -0.0919590 -0.1953743 -0.0049898 -0.2208656
MM.A68AFF -0.1033092 -0.2523488 -0.0551115 0.1857865 -0.3026300
MM.7AAE00 0.0447873 -0.1427219 -0.1689380 -0.0309007 0.0611243
MM.F066E9 0.2233651 -0.2377401 -0.1699965 0.0876813 0.0027197
MM.F17D50 -0.0292865 -0.1584801 0.0282201 -0.0574894 -0.1272121
MM.FC7181 0.1955548 -0.1563406 -0.0808901 -0.1455875 -0.2342066
MM.EC823A -0.1421693 -0.0479980 0.1306663 0.0061587 0.0430525
MM.00C0BF 0.0602026 0.0366131 0.1567911 0.0171246 -0.0783151
MM.F564E4 -0.0945080 -0.0796375 0.3249078 -0.1612102 0.1784736
MM.CA7BFF -0.2499270 -0.0610186 0.1599421 -0.0499556 0.3620073
MM.96A900 0.0021634 -0.0528362 0.1451570 0.0311304 -0.0514997
MM.CB9600 0.0860467 0.0446830 -0.1322691 0.0638637 0.0027866
MM.E28900 -0.0600543 0.0004279 0.1534020 -0.2031415 0.1342603
MM.FF62C0 0.1655513 0.0327500 0.1570431 -0.1833142 -0.0654974
MM.00B5ED 0.0023296 0.0022908 0.0022458 -0.2142460 -0.1924226
MM.00B7E7 -0.2806639 -0.1595264 -0.0208645 -0.2213569 -0.2211422
MM.E68617 -0.2133244 -0.1312299 0.0601392 0.1254564 -0.2598543
MM.9A8EFF 0.1034601 0.0069406 0.0731890 -0.1689500 0.0436587
MM.00B9E2 0.2041459 -0.0822776 -0.0035590 0.0920156 -0.3574428
MM.7299FF -0.0564415 -0.0849619 -0.1945061 -0.1822992 0.0136415
MM.89AC00 -0.1328743 -0.0565570 -0.0240428 -0.0454598 -0.1240515
MM.00BFC3 -0.4811593 -0.1474751 0.0144962 0.2147579 0.1738595
MM.00C1A5 -0.5234318 -0.0268545 0.1775310 0.0760701 0.1374761
MM.FE61CE -0.3851864 -0.2307167 0.2868367 -0.1409996 0.3080282
MM.00BE71 -0.2538038 -0.0353115 0.1252727 0.1525828 0.2520486
MM.54B400 -0.1514179 -0.2036330 0.0843112 0.1380505 0.2496057
MM.00B0F6 -0.0883521 -0.0886837 0.0705816 0.2021291 -0.3630725
MM.00BADF -0.1211324 -0.1143947 0.0647088 0.2359028 -0.1642637
MM.00A6FF -0.0139928 -0.1112450 -0.0641009 0.2932709 -0.1716642
MM.00BC50 -0.0240337 -0.2148826 0.1844993 0.1095694 0.0334840
MM.D59100 0.1836014 -0.1025401 0.1492553 -0.0759777 -0.2536675
MM.E88526 -0.2793058 0.1734833 0.3868303 -0.0040529 -0.0164115
MM.00C0B4 -0.0358763 0.1270857 0.2071526 -0.0691158 -0.2035308
MM.BA9E00 -0.0402595 0.2013498 0.0598957 0.2192458 -0.3899604
MM.E48800 -0.4431052 0.0541640 0.0707599 0.3440599 -0.1910295
MM.D376FF -0.4160863 0.0803843 0.0632989 0.2854182 -0.1405067
MM.F97572 -0.2902610 -0.0218445 0.1870905 0.2289007 -0.1113146
MM.F47B5C -0.2861021 0.0404414 0.0949334 0.2196560 -0.1711716
MM.FF62BC -0.3348724 0.1588491 0.0667566 0.0335587 -0.1806243
MM.B0A100 -0.1210961 -0.0260663 0.0219100 -0.1385001 0.2710036
MM.00B2F3 -0.0161654 -0.1182401 0.1845002 -0.4118363 0.1090137
MM.FF68A0 0.0104264 -0.1505983 0.1721016 -0.3815489 0.2726866
MM.00BBDC -0.2886580 -0.0768398 0.2736795 -0.0324868 0.1319011
MM.A2A600 -0.0725262 -0.1351666 0.1722917 -0.1562774 0.0499614
MM.FB727C -0.2403827 -0.2762439 0.2465525 -0.0568520 0.2026388
MM.F57962 0.0557382 -0.0448817 0.1135430 -0.4234336 -0.0246768
MM.EF7F49 -0.0865946 -0.0700757 0.1706351 0.0709166 0.0422290
MM.F862DE -0.2814413 0.0691592 0.1178086 -0.0593115 -0.0185101
MM.FF61CB 0.0668781 -0.0019335 -0.0284921 -0.0886036 -0.1812855
MM.F365E7 0.1202115 -0.0376484 -0.2590009 0.1600499 -0.1300320
MM.00BD5C -0.0698285 0.0842668 -0.1551238 0.0667640 0.1461961
MM.E16FF8 -0.1360128 0.1895402 -0.1614543 0.2951263 -0.0201645
MM.41A0FF -0.0468889 0.1648861 -0.2622889 0.0123748 -0.1971404
MM.D674FD 0.0585867 -0.0779747 -0.1948630 0.0868083 -0.0823858
MM.00A8FF 0.0149323 -0.0856510 -0.1211812 -0.0454125 0.0316724
MM.00AEFA -0.1675213 -0.0450692 -0.0757147 -0.0100648 -0.0089671
MM.69B100 0.1293225 0.1854102 -0.1004264 -0.0562720 -0.1104397
MM.DC8D00 0.1412469 0.1899825 -0.1233355 0.0642057 0.0355889
MM.6FB000 0.2209632 -0.0064714 0.0969926 -0.2044851 0.1264880
MM.FF66A9 0.0121733 -0.0549964 -0.1895272 0.3208181 -0.0676024
MM.00ABFD 0.1403745 0.0754582 0.0003354 0.4332209 -0.2567813
MM.00C1A1 0.3745298 -0.0385659 0.0651131 0.3913906 -0.2808409
MM.00ACFB -0.0015590 -0.1732971 -0.0125635 0.4088120 -0.0578036
MM.00C091 0.0294668 -0.0331666 -0.1302354 0.3142057 -0.2260553
MM.00C0BC 0.2280528 0.0910591 -0.2142622 0.0264821 -0.2029821
MM.00BE6C 0.3219057 -0.0960033 -0.1413415 0.1133188 -0.1317948
MM.519FFF 0.2869834 -0.0431555 -0.0873560 0.0890453 -0.1325873
MM.84AD00 0.1795734 0.1028054 0.0082200 0.0733616 0.0133560
MM.B79F00 0.2608358 -0.0589036 -0.0971093 0.0753399 -0.0955133
MM.00B81B 0.0482914 0.0399692 -0.0799496 0.3675368 -0.1355341
MM.9390FF 0.2817463 0.0981325 0.1073574 0.0144508 -0.2265971
MM.00C08C 0.1337248 0.1156846 -0.0457598 0.1242437 -0.2993837
MM.8B92FF 0.1309632 -0.0087469 -0.0855231 0.2144029 -0.2428341
MM.CE9500 0.1117162 0.0681770 -0.1036993 0.0058761 -0.1967030
MM.C09B00 0.0736034 0.1206557 -0.1235497 0.0887377 -0.4220942
MM.689BFF 0.0176764 0.0263071 -0.0967239 0.1715337 -0.3086085
MM.BD9D00 0.1252527 -0.0457977 -0.0393137 0.0377663 -0.1814538
MM.D89000 0.2829346 0.0569349 0.0067017 -0.0428331 -0.1800564
MM.F27C56 -0.1802223 0.1513147 0.2615533 -0.0847743 0.1263765
MM.FF63B8 -0.0464448 0.2772324 0.3367183 0.1401354 0.0307738
MM.A6A500 0.0038878 0.1355912 0.0031874 -0.0030741 -0.0230629
MM.E76CF3 -0.2302061 0.1279969 0.0493847 -0.0289088 0.0496309
MM.AC87FF -0.0008319 0.1521324 0.1970220 0.0366805 0.0860195
MM.B285FF -0.0122451 0.1180429 -0.0268260 0.0040965 0.0551470
MM.BC81FF 0.1986401 -0.0199746 -0.0776289 0.1240520 -0.0236505
MM.C69900 -0.1223818 0.1627652 -0.0409527 0.5703819 -0.3113755
MM.A08CFF 0.1026062 0.2520008 0.0460714 0.4904329 -0.3051399
MM.FF699C 0.0397646 0.1438477 0.0459507 0.7126645 -0.3252229
MM.A9A300 0.3430757 0.1439600 -0.2710043 0.2539190 -0.1727651
MM.CF78FF -0.0271420 0.3041182 -0.0543828 0.3215367 -0.2979922
MM.FF63B5 0.0619935 0.2455039 0.0737683 0.3301560 -0.1200783
MM.B3A000 -0.3128685 0.2593817 0.1784808 0.5006428 -0.1726234
MM.EE8042 -0.2501559 -0.0789342 -0.1325453 0.4651907 -0.1000515
MM.FD6F86 -0.2212108 -0.0330105 0.0884504 0.4613586 0.1255511
MM.FF65AD -0.0429387 0.0802066 0.1457200 0.3218229 0.0932522
MM.D39300 0.1054059 -0.1135297 -0.0635519 -0.0133794 -0.2527259
MM.DA8F00 -0.2334678 -0.1600503 -0.1734183 -0.0146120 -0.1334378
MM.E08A00 -0.1280337 -0.2265972 -0.1267886 -0.0386714 -0.0898398
MM.36B600 -0.1348787 0.1477321 -0.0562851 -0.0858334 -0.1130110
MM.00BF7F -0.0584784 0.0079936 -0.1102348 0.1356307 -0.3023820
MM.C39A00 -0.1238928 -0.0255296 -0.1297396 -0.1292071 -0.4003534
MM.00BE67 -0.3232137 0.1236599 -0.1616663 0.4459072 -0.0575986
MM.00BA3B -0.2706986 0.0334877 -0.1265960 0.2905155 -0.0278966
MM.26B700 -0.1315751 0.0971234 -0.0953088 -0.0540016 -0.1346682
MM.FF61C4 -0.2821833 -0.2479993 -0.1197745 0.0687274 -0.0021904
MM.00B4EF -0.4754960 -0.0839435 -0.0418083 0.0861220 0.0251216
MM.00C0B8 -0.2928022 0.0250557 -0.1325104 0.2215813 -0.1106564
MM.00B8E5 -0.5642678 0.0580710 0.1249291 0.0229783 0.2584034
MM.7FAE00 -0.3945889 0.0376152 0.0640891 0.0521203 -0.0051855
MM.9EA700 -0.1181893 0.0360368 0.1513191 0.2629816 0.2349404
MM.EA8331 -0.2555895 -0.0836345 0.1417044 0.0521215 0.1083201
MM.00A4FF -0.2645376 -0.1777527 0.1676596 0.0067842 0.3910741
MM.00BD61 -0.4065643 0.0110945 -0.0130769 0.1403207 0.1912629
MM.DE8C00 -0.4730860 0.0810353 -0.0640526 0.0673969 0.2635656
MM.FF6B93 -0.5372781 -0.0605542 -0.0145547 -0.0052325 0.2506822
MM.00C1A9 -0.1894042 0.1026223 -0.0985574 -0.0898765 0.1820065
MM.F962DB 0.0862055 0.2158087 0.0144769 -0.1135334 -0.0083465
MM.FE6E8A -0.0701145 0.2428970 -0.1130249 0.0360659 0.0110110
MM.00A9FF 0.0365159 0.2241052 -0.0318342 0.0852688 -0.1933572
MM.2DA2FF -0.1423111 0.2318031 -0.0512034 0.2603327 -0.2758033
MM.00C084 -0.2948779 0.1170022 -0.0861731 -0.0869003 0.0125234
MM.00C199 -0.1309792 0.1724535 -0.0538031 0.0545851 0.0541089
MM.00BF76 -0.3321797 0.0406058 0.0050976 0.0765755 -0.0626544
MM.FF6D8F -0.2385730 0.1065616 0.1887381 0.0652792 0.0881456
MM.00C1AD 0.1275753 0.2044004 0.0141878 0.3082653 -0.0832395
MM.00AFF8 -0.3963993 0.0296292 -0.0657242 0.4655795 0.1732653
MM.00BCD6 -0.1147207 -0.0203639 -0.0430420 0.2248616 0.0382269
MM.00BB4A -0.1160572 0.0264655 0.0911445 0.1711001 -0.1106245
MM.FF64B1 -0.1330265 -0.0936888 -0.1052307 0.3350418 -0.1887611
MM.5D9DFF -0.2920405 -0.1156832 -0.0583797 0.2901782 0.0173481
MM.FB61D8 -0.2282430 -0.1066564 -0.0611022 0.0681556 0.0093372
MM.8394FF 0.0409601 -0.1004581 0.0310718 0.1981330 0.0294392
MM.DA73FB -0.0048091 0.0714307 -0.0384256 -0.0107140 -0.1106684
MM.FD61D1 0.0341298 0.1265510 0.0448392 0.1004148 -0.1006089
MM.00BF7B 0.0163006 0.0919436 -0.0110380 -0.1879570 -0.0497140
MM.00C095 0.2932353 0.0304633 0.0345144 -0.4256837 -0.1940423
MM.C89800 0.1891880 -0.0720418 0.2670455 -0.0876392 0.0366375
MM.F663E1 0.0822632 -0.0448407 -0.0269092 0.1618438 -0.0422909
MM.00B932 0.3441750 -0.1236846 -0.0675839 0.1401543 0.0265007
MM.00BFC6 0.0906250 -0.0254353 -0.0882687 0.3733545 0.0512347
MM.E46DF6 0.4846314 -0.0474662 -0.1946545 0.2704567 -0.2284121
MM.63B200 0.3493821 0.3344805 0.1714011 0.2443688 -0.4407672
MM.00C0B1 0.2052670 -0.0415786 0.0954790 0.1950309 0.0459709
MM.9AA800 0.1519155 0.0729935 0.0952729 0.0906603 0.0919619
MM.00BDD0 0.4440557 0.0062351 0.0194160 -0.0430723 0.0101382
MM.D09400 0.3129375 -0.1019755 0.2007590 -0.1615614 0.0683471
MM.FF61C7 0.1058073 -0.0723919 0.0065686 -0.0390892 0.2903613
MM.EC69EF -0.0627677 -0.0946343 -0.0383395 -0.1692150 0.1869633
MM.00B6EA 0.0070692 -0.0097559 -0.0355678 0.0821512 0.2235396
MM.00C088 0.0127975 0.1396992 -0.2323556 0.3156530 0.0411680
MM.00B928 -0.1542532 -0.0501294 -0.0115991 0.0183367 0.3016143
MM.41B500 -0.2210792 -0.1710673 -0.2188296 0.1333564 0.3318016
MM.EE67EC -0.0847306 -0.1965342 -0.0599840 0.1783801 0.3514116
MM.FC61D5 0.1072275 -0.0583836 -0.0634250 -0.1174099 0.0043430
MM.00B3F1 0.1639630 -0.0923955 -0.3070866 -0.1574762 0.0525390
MM.00BEC9 0.5255341 -0.0456876 -0.2048339 0.1289198 -0.0531900
MM.DD71FA 0.1930646 -0.1111404 -0.2358544 0.0436290 0.2429990
absGS 0.1183599 0.1206639 0.1354277 0.3896922 0.0841230
SFARI 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
rm(table_info)
# Keep the unfiltered dataset around (it still has Module and gene.score)
# and make the filtered version the working dataset from here on
original_dataset = dataset
dataset = new_dataset

rm(new_dataset)

The final dataset contains 13204 observations (genes) and 183 variables



Exploratory Analysis


PCA of Variables


The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and both SFARI and absGS are in the middle of both groups (just like with Gandal’s and Gupta’s datasets)

# PCA on the transposed dataset so each variable (feature) is an observation
pca = dataset %>% mutate(SFARI = as.numeric(SFARI)) %>% t %>% prcomp

# Tag each feature by its type (module membership, SFARI, GS, MTcor)
plot_data = data.frame('ID'=colnames(dataset), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2],
                       type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
                            ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
                            ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))


# Module-Trait correlation of each module (one row per module)
mtcor_by_module = original_dataset %>% dplyr::select(Module, MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')

# Module IDs are hex colours ('#xxxxxx'); rename them to match the MM.xxxxxx
# feature names before joining the MTcor values onto the plot data
plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')

# Interactive scatter of the features coloured by their module's MTcor;
# axis labels report the % of variance explained by each component
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(aes(id=ID)) +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
         scale_colour_distiller(palette = 'RdBu', na.value = 'darkgrey') + theme_minimal() +
         ggtitle('PCA of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, pca)


PCA of Samples


  • The two main patterns that seem to characterise the genes are their Gene Significance and the Module-Diagnosis correlation of their corresponding module

  • Mean Expression doesn’t seem to play an important role

  • SFARI Genes seem to be evenly distributed everywhere (perhaps they have a slightly higher distribution in the 2nd principal component?)

  • It’s not clear what the 2nd principal component is capturing

# Mean Expression data
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
rownames(datExpr) = datGenes$ensembl_gene_id
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))

# PCA
# Genes are the columns of t(dataset), so their coordinates live in pca$rotation
pca = dataset %>% t %>% prcomp

# One row per gene with its PC coordinates plus the annotations used for
# colouring; SFARI genes get a higher alpha so they stand out
plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2], 
                       'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
            mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')

# Four views of the same projection, coloured by different annotations
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme(legend.position='bottom')

p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')

p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(alpha = plot_data$alpha) +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
# Marginal densities make the SFARI vs non-SFARI distribution comparison easier
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)

p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')

grid.arrange(p1, p2, p3, p4, nrow=2)

rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p1, p2, p3, p4)



Dividing samples into Training and Test Sets


4.85% of the observations are positive. This can be a problem when training the classification model, so the samples in the training set should be balanced between classes before the model is trained.

# Class balance of the objective variable (SFARI TRUE/FALSE counts)
table_info = dataset %>% apply_labels(SFARI = 'SFARI')

cro(table_info$SFARI)
 #Total 
 SFARI 
   FALSE  12563
   TRUE  641
   #Total cases  13204
rm(table_info)

To divide our samples into training and test sets:

set.seed(123)

# Use the SFARI gene score (from the unfiltered dataset) as the
# stratification variable so train and test sets are balanced across scores
sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>% 
                left_join(original_dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score), 
                          by = 'ID') %>% 
                mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))

# 70% training / 30% test, stratified by gene score
train_idx = createDataPartition(sample_scores$gene.score, p = 0.7, list = FALSE)
train_set = dataset[train_idx,]
test_set = dataset[-train_idx,]


rm(sample_scores, train_idx)


Label distribution in training set


To fix this class imbalance, we are going to apply SMOTE to the training set: a technique that over-samples the minority class (SFARI Genes) by creating synthetic examples

cro(train_set$SFARI)
 #Total 
 train_set$SFARI 
   FALSE  8795
   TRUE  451
   #Total cases  9246


Labels distribution in test set


This set is used just to evaluate how well the model performs, so the class imbalance is not a problem here

cro(test_set$SFARI)
 #Total 
 test_set$SFARI 
   FALSE  3768
   TRUE  190
   #Total cases  3958


Logistic Regression


Train model

# https://shiring.github.io/machine_learning/2017/04/02/unbalanced
# https://topepo.github.io/caret/using-your-own-model-in-train.html#Illustration5


# caret's classProbs requires factor levels that are valid R names, so the
# logical label is recoded to 'SFARI'/'not_SFARI'
train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)

# Repeated 10-fold cross-validation with SMOTE over-sampling applied inside
# each fold (see links above)
k_fold = 10
cv_repeats = 5
smote_over_sampling = trainControl(method = 'repeatedcv', number = k_fold, repeats = cv_repeats, 
                                   verboseIter = FALSE, classProbs = TRUE, savePredictions = 'final', 
                                   summaryFunction = twoClassSummary, sampling = 'smote')

# Using ROC as metric because it doesn't depend on the threshold
fit = caret::train(SFARI ~ ., data = train_set, method = 'glm', family = 'binomial', metric = 'ROC',
                   trControl = smote_over_sampling)

# There is some perfect multicollinearity that doesn't let us do the vif analysis, so I'll remove those variables (you cannot use alias in caret::train, so I had to train the model again directly with glm)
ld.vars = attributes(alias(glm(SFARI~., data = train_set, family = 'binomial'))$Complete)$dimnames[[1]]

# Remove the linearly dependent variables by subtracting them from the formula
formula.new = as.formula( paste('SFARI ~ .', paste(ld.vars, collapse='-'), sep='-') )

# Retrain model without these variables
fit = caret::train(formula.new, data = train_set, method = 'glm', family = 'binomial', metric = 'ROC',
                   trControl = smote_over_sampling)


rm(smote_over_sampling, ld.vars, formula.new, k_fold, cv_repeats)


Performance


The model has an AUC of 0.6112

But the features are strongly correlated, which inflates the standard error of the coefficients, making them no longer interpretable, so perhaps it would be better to use another model

# VIF: features with VIF > 10 are flagged as strongly collinear.
# Compute and sort the VIFs once instead of calling car::vif() twice
vif_vals = car::vif(fit$finalModel) %>% sort
plot_data = data.frame('Feature' = names(vif_vals),
                       'VIF' = unname(vif_vals)) %>%
            mutate(outlier = VIF>10)

# Bar plot on a log scale with the VIF = 10 threshold marked
plot_data %>% ggplot(aes(reorder(Feature, -VIF), VIF, fill = !outlier)) + geom_bar(stat='identity') + 
              scale_y_log10() + geom_hline(yintercept = 10, color = 'gray', linetype = 'dashed') + 
              xlab('Model Features') + ggtitle('Variance Inflation Number for each Feature') + theme_minimal() +
              theme(legend.position = 'none', axis.text.x = element_text(angle = 90, hjust = 1))

rm(plot_data, vif_vals)

Correlation plot

# Pairwise correlations between all features (the last column, the SFARI
# label, is excluded)
corrplot.mixed(cor(train_set[,-ncol(train_set)]), lower = 'number', lower.col = 'gray', number.cex = .6, 
               tl.pos = 'l', tl.col = '#666666')


Possible solutions to Multicollinearity:


  1. Remove all variables with a VIF>10: We would lose all but two of our variables, not ideal

  2. Do Principal Component Regression: We would lose the relation between the prediction and the original features, which could be interesting to study

  3. Don’t do anything: Multicollinearity affects the coefficients and p-values of the regression, but it doesn’t affect the predictions, the precision of the predictions, or the goodness-of-fit statistics; however, as with the previous option, we cannot study the coefficients of the regression

  4. Use Ridge Regression: The penalty it gives to high coefficients reduces the variance introduced by the correlation, making the coefficients interpretable again




Ridge Regression


Notes:

### DEFINE FUNCTIONS

# Split the (global) dataset into train/test sets, stratified by SFARI gene
# score so both sets keep the same score distribution.
#   p:    fraction of samples assigned to the training set
#   seed: RNG seed, so the partition is reproducible
create_train_test_sets = function(p, seed){

  # Recover each gene's SFARI score from the unfiltered dataset; genes
  # without a score are labelled 'None'
  gene_scores = original_dataset %>% mutate(ID = rownames(.)) %>%
                dplyr::select(ID, gene.score)
  sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>%
                  left_join(gene_scores, by = 'ID') %>%
                  mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))

  # Stratified partition: proportion p goes to training, the rest to testing
  set.seed(seed)
  train_idx = createDataPartition(sample_scores$gene.score, p = p, list = FALSE)

  return(list('train_set' = dataset[train_idx,], 'test_set' = dataset[-train_idx,]))
}



# Train a ridge-penalised logistic regression (glmnet, alpha = 0) on one
# stratified train/test split and return its performance metrics, per-gene
# test-set predictions, and fitted coefficients.
#   p:    training-set fraction passed to create_train_test_sets
#   seed: RNG seed for the split, the CV folds and SMOTE
run_model = function(p, seed){
  
  # Create train and test sets
  train_test_sets = create_train_test_sets(p, seed)
  train_set = train_test_sets[['train_set']]
  test_set = train_test_sets[['test_set']]
  
  # Train Model
  # caret's classProbs needs valid R names as factor levels
  train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)
  # Grid of L2 penalties from 10 down to 1e-4 (alpha = 0 => pure ridge)
  lambda_seq = 10^seq(1, -4, by = -.1)
  set.seed(seed)
  k_fold = 10
  cv_repeats = 5
  smote_over_sampling = trainControl(method = 'repeatedcv', number = k_fold, repeats = cv_repeats,
                                     verboseIter = FALSE, classProbs = TRUE, savePredictions = 'final', 
                                     summaryFunction = twoClassSummary, sampling = 'smote')
  fit = train(SFARI ~., data = train_set, method = 'glmnet', trControl = smote_over_sampling, metric = 'ROC',
              tuneGrid = expand.grid(alpha = 0, lambda = lambda_seq))
  
  # Predict labels in test set (probability > 0.5 => predicted SFARI)
  predictions = fit %>% predict(test_set, type = 'prob')
  preds = data.frame('ID' = rownames(test_set), 'prob' = predictions$SFARI) %>% mutate(pred = prob>0.5)

  # Measure performance of the model; logical labels coerce to 0/1, with
  # '1' (TRUE = SFARI) as the positive class
  acc = mean(test_set$SFARI==preds$pred)
  prec = Precision(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  rec = Recall(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  F1 = F1_Score(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  pred_ROCR = prediction(preds$prob, test_set$SFARI)
  AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
  
  # Extract coefficients from features at the CV-selected lambda
  coefs = coef(fit$finalModel, fit$bestTune$lambda) %>% as.vector
  
  return(list('acc' = acc, 'prec' = prec, 'rec' = rec, 'F1' = F1, 'AUC' = AUC, 'preds' = preds, 'coefs' =coefs))
}


### RUN MODEL

# Parameters
p = 0.75

n_iter = 25
seeds = 123:(123+n_iter-1)

# Store outputs (metric vectors preallocated instead of grown with c()
# inside the loop)
acc = numeric(n_iter)
prec = numeric(n_iter)
rec = numeric(n_iter)
F1 = numeric(n_iter)
AUC = numeric(n_iter)
predictions = data.frame('ID' = rownames(dataset), 'SFARI' = dataset$SFARI, 'prob' = 0, 'pred' = 0, 'n' = 0)
coefs = data.frame('var' = c('Intercept', colnames(dataset[,-ncol(dataset)])), 'coef' = 0)

for(i in seq_along(seeds)){
  
  # Run model with this iteration's seed
  model_output = run_model(p, seeds[i])
  
  # Update outputs
  acc[i] = model_output[['acc']]
  prec[i] = model_output[['prec']]
  rec[i] = model_output[['rec']]
  F1[i] = model_output[['F1']]
  AUC[i] = model_output[['AUC']]
  preds = model_output[['preds']]
  coefs$coef = coefs$coef + model_output[['coefs']]
  # Accumulate probabilities/predictions for the genes in this iteration's
  # test set; n counts how many times each gene was tested
  update_preds = preds %>% dplyr::select(-ID) %>% mutate(n=1)
  test_rows = predictions$ID %in% preds$ID
  predictions[test_rows, c('prob','pred','n')] = predictions[test_rows, c('prob','pred','n')] + update_preds
}

# Average the coefficients over all iterations and the probabilities over the
# number of times each gene appeared in a test set
coefs = coefs %>% mutate(coef = coef/n_iter)
predictions = predictions %>% mutate(prob = prob/n, pred_count = pred, pred = prob>0.5)


rm(p, seeds, update_preds, create_train_test_sets, run_model, i, test_rows)


To summarise the predictions of all the model iterations into a single value per gene:

# Keep only genes that appeared in at least one test set and attach their
# GS and MTcor values for the performance analysis below
test_set = predictions %>% filter(n>0) %>% 
           left_join(dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, GS, MTcor), by = 'ID')
rownames(test_set) = predictions$ID[predictions$n>0]


Performance metrics


Confusion matrix

# Confusion matrix of the averaged predictions vs the actual SFARI labels
conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels', 
                                     prob = 'Assigned Probability', 
                                     pred = 'Label Prediction')

cro(conf_mat$SFARI, list(conf_mat$pred, total()))
 Label Prediction     #Total 
 FALSE   TRUE   
 Actual Labels 
   FALSE  10100 2452   12552
   TRUE  410 228   638
   #Total cases  10510 2680   13190
rm(conf_mat)


Accuracy: Mean = 0.7758 SD = 0.0165


Precision: Mean = 0.0845 SD = 0.0078


Recall: Mean = 0.3701 SD = 0.0382


F1 score: Mean = 0.1374 SD = 0.0121


ROC Curve: Mean = 0.6262 SD = 0.0224

# ROC curve built from the averaged test-set probabilities
pred_ROCR = prediction(test_set$prob, test_set$SFARI)

roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
#auc = performance(pred_ROCR, measure='auc')@y.values[[1]]

# The title reports the mean AUC over all iterations (AUC vector from the
# model loop); the diagonal marks random-classifier performance
plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(mean(AUC),2),')'), col='#009999')
abline(a=0, b=1, col='#666666')


Lift Curve

# Lift vs rate of positive predictions
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')

rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)




Coefficients


GS and absGS have very small coefficients

# Per-gene module assignment with the module colour's leading '#' stripped so
# it can be matched against the 'MM.<module>' coefficient names
gene_corr_info = dataset %>% mutate('ID' = rownames(dataset)) %>% dplyr::select(ID, MTcor, SFARI) %>% 
                 left_join(assigned_module, by ='ID') %>% mutate(Module = gsub('#','',Module))

# One row per feature: its coefficient, the module's correlation to diagnosis,
# and the fraction of the module's genes that are SFARI genes.
# NOTE(review): the result is still grouped by (feature, coef) after summarise;
# harmless for the downstream joins/plots, but ungroup() would be safer
coef_info = coefs %>% mutate('feature' = gsub('MM.','',var)) %>% 
            left_join(gene_corr_info, by = c('feature' = 'Module')) %>% 
            dplyr::select(feature, coef, MTcor, SFARI) %>% group_by(feature, coef, MTcor) %>% 
            summarise('SFARI_perc' = mean(SFARI)) %>% arrange(desc(coef))

# Show only the non-module-membership coefficients (Intercept, GS, absGS, MTcor)
coef_info %>%
  filter(feature %in% c('Intercept','GS','absGS','MTcor')) %>%
  dplyr::select(feature, coef) %>%
  dplyr::rename('Feature' = feature, 'Coefficient' = coef) %>%
  kable(align = 'cc',
        caption = 'Regression Coefficients (filtering MM features)') %>%
  kable_styling(full_width = F)
Regression Coefficients (filtering MM features)
Feature Coefficient
MTcor 0.1707838
absGS -0.0286801
GS -0.0826057
Intercept -0.4455355


There is a positive relation between the coefficient assigned to the membership of each module and the enrichment (using ORA) in SFARI genes that are assigned to that module

load('./../Data/ORA.RData')

# For each module, summarise SFARI over-representation as 1 - ORA p-value
# (0 when the module has no 'SFARI' entry in its enrichment results).
# vapply + a single data.frame call replaces the original add_row loop, which
# grew the data.frame one row at a time (copies the whole object per iteration).
# stringsAsFactors = FALSE keeps Module as character on every R version.
module_names = names(enrichment_SFARI)
module_enrichments = vapply(module_names, function(m){
  m_info = enrichment_SFARI[[m]]
  1 - ifelse('SFARI' %in% m_info$ID, m_info$pvalue[m_info$ID=='SFARI'], 1)
}, numeric(1))
enrichment_SFARI_info = data.frame('Module' = gsub('#','',module_names),
                                   'SFARI_enrichment' = unname(module_enrichments),
                                   stringsAsFactors = FALSE)

# Join the module coefficients with their SFARI enrichment; modules without a
# Module-Diagnosis correlation are dropped
plot_data = coef_info %>% dplyr::rename('Module' = feature) %>% 
            left_join(enrichment_SFARI_info, by = 'Module') %>% filter(!is.na(MTcor))

# Each point is a module, drawn in its own colour (Module holds the hex code
# without the leading '#'); colours align row-for-row with the plotted data
ggplotly(plot_data %>% ggplot(aes(coef, SFARI_enrichment)) + 
         geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
         geom_point(aes(id = Module), color = paste0('#',plot_data$Module), alpha=0.7) + 
         theme_minimal() + xlab('Coefficient') + 
         ylab('SFARI Genes Enrichment'))
rm(enrichment_old_SFARI, enrichment_DGN, enrichment_DO, enrichment_GO, enrichment_KEGG, enrichment_Reactome,
   module_names, module_enrichments)


There doesn’t seem to be a relation between the coefficient and the correlation of the module and the diagnosis.

This is not a surprise since we knew that there was a negative relation between SFARI genes and Module-Diagnosis correlation from Preprocessing/Gandal/AllRegions/RMarkdowns/20_04_03_WGCNA_modules_EA.html. The fact that there is no relation between coefficient and Module-Diagnosis correlation could even be a good sign that the model is picking some biological signal as well as the SFARI patterns (since the relation with the biological signals is positive)

# Coefficient vs Module-Diagnosis correlation, one point per module.
# NOTE(review): point colours come from coef_info$feature filtered with the
# same !is.na(MTcor) mask used in the pipeline, so they align row-for-row with
# the plotted data — fragile if either filter changes independently
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
              ggplot(aes(coef, MTcor)) +  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
              geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)]), 
                         alpha = 0.7) + 
              theme_minimal() + xlab('Coefficient') + 
              ylab('Module-Diagnosis correlation'))




Analyse Results


Score distribution by SFARI Label


SFARI genes have a higher score distribution than the rest, but the overlap is large

# Model score densities for SFARI vs non-SFARI genes, with each group's mean
# marked as a dashed vertical line
plot_data = test_set %>% dplyr::select(prob, SFARI)

mean_prob_SFARI = mean(plot_data$prob[plot_data$SFARI])
mean_prob_rest  = mean(plot_data$prob[!plot_data$SFARI])

ggplotly(plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) +
         geom_density(alpha=0.3) + xlab('Score') +
         geom_vline(xintercept = mean_prob_SFARI, color = '#00C0C2', linetype = 'dashed') +
         geom_vline(xintercept = mean_prob_rest, color = '#FF7371', linetype = 'dashed') +
         theme_minimal() + ggtitle('Model score distribution by SFARI Label'))


Score distribution by SFARI Score


Even though we didn’t use the actual SFARI Scores to train the model, but instead we grouped them all together, there seems to be a statistically significant positive relation between the SFARI scores and the probability assigned by the model

# Attach the original SFARI score (1-3) to each test-set gene; genes without a
# score are split into 'Neuronal' (GO neuronal annotation) and 'Others'
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
            mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                       gene.score)) %>%
            dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')

# Counts per SFARI score group
cro(plot_data$gene.score)
 #Total 
 SFARI Gene score 
   1  105
   2  168
   3  365
   Neuronal  782
   Others  11770
   #Total cases  13190
# Pairwise Welch t-tests between all SFARI score groups, stacked above the
# boxplots at increasing heights so the significance bars don't overlap.
# (The original also computed `mean_vals` here but never used it before
# removing it — that dead code has been dropped.)
comparisons = list(c('1','2'), c('2','3'), c('3','Neuronal'), c('Neuronal','Others'),
                   c('1','3'), c('3','Others'), c('2','Neuronal'),
                   c('1','Neuronal'), c('2','Others'), c('1','Others'))
increase = 0.08
base = 0.9
pos_y_comparisons = c(rep(base, 4), rep(base + increase, 2), base + 2:5*increase)

plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + 
              geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
              stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test', 
                                 method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, 
                                 tip.length = .02) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('SFARI score') + ylab('Probability') + theme_minimal() + theme(legend.position = 'none')

rm(comparisons, increase, base, pos_y_comparisons)


Genes with the highest Probabilities


  • Considering the class imbalance in the test set (1:19), there are many more SFARI scores in here (1:4)

  • High concentration of genes with a SFARI Score of 1

# Top 50 genes by model probability, annotated with gene symbol, SFARI score,
# Gene Significance and the module's correlation to diagnosis
# (arrange before top_n keeps the output sorted by descending probability)
test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>% 
             arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
             left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID')  %>%
             mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                        gene.score)) %>%
             left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
             dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 'ModuleDiagnosis_corr' =MTcor,
                           'GeneSignificance' = GS) %>%
             mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr, 4), Probability = round(Probability, 4), 
                    GeneSignificance = round(GeneSignificance, 4)) %>%
             dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability,
                           gene.score) %>%
             kable(caption = 'Genes with highest model probabilities from the test set') %>% 
             kable_styling(full_width = F)
Genes with highest model probabilities from the test set
GeneSymbol GeneSignificance ModuleDiagnosis_corr Module Probability gene.score
GABBR2 0.1204 0.2418 #A68AFF 0.8274 3
LONRF2 -0.0686 -0.2919 #00BFC3 0.7981 Others
CACNA1D -0.4023 -0.2919 #00BFC3 0.7929 2
PLXDC2 0.3497 0.5272 #F564E4 0.7915 Others
DOCK5 -0.3920 -0.2919 #00BFC3 0.7756 Others
FA2H -0.2586 -0.2919 #00BFC3 0.7756 Others
GLS 0.2521 0.3292 #EC823A 0.7751 Others
MYRF -0.3458 -0.2919 #00BFC3 0.7715 Others
KIAA1549L 0.2070 0.2418 #A68AFF 0.7715 Others
PLXNC1 0.5660 0.5125 #89AC00 0.7691 Others
SH3TC2 -0.3946 -0.2919 #00BFC3 0.7662 Others
PCLO 0.2970 0.4802 #CA7BFF 0.7641 Others
CNTN2 -0.3848 -0.2919 #00BFC3 0.7617 Neuronal
PIEZO2 -0.0962 -0.2919 #00BFC3 0.7610 Others
HHIP -0.1148 -0.2919 #00BFC3 0.7610 Others
IGF1 -0.0617 -0.0683 #FF6B93 0.7606 Others
PTPRT -0.0161 0.2203 #F17D50 0.7603 3
SCN2A -0.2300 0.3292 #EC823A 0.7590 1
PAK3 0.2199 0.2203 #F17D50 0.7589 Others
ST18 -0.3694 -0.2919 #00BFC3 0.7587 Others
GRIA1 0.2054 0.4910 #E28900 0.7579 2
PLEKHH1 -0.2875 -0.2919 #00BFC3 0.7575 Others
CARNS1 -0.3334 -0.2919 #00BFC3 0.7563 Others
EVI2A -0.0599 -0.2919 #00BFC3 0.7562 Others
CALB2 0.1147 -0.5008 #00BADF 0.7562 Others
KIAA1217 0.3683 0.5621 #4BB400 0.7523 Others
LDB3 -0.4007 -0.2919 #00BFC3 0.7486 Others
PLEKHA6 0.1433 0.2418 #A68AFF 0.7486 Others
LRP1 0.3207 0.3838 #B79F00 0.7484 2
SHISA9 -0.0285 0.2203 #F17D50 0.7479 Neuronal
GALNTL6 0.0407 -0.5008 #00BADF 0.7459 Others
SRGAP1 0.3651 0.1211 #FB727C 0.7459 Others
ERBB3 -0.4488 -0.2919 #00BFC3 0.7450 Neuronal
TRIM59 0.0192 -0.2919 #00BFC3 0.7435 Others
TP53I11 -0.1272 0.5125 #89AC00 0.7432 Others
SRGAP3 0.1742 0.2418 #A68AFF 0.7386 3
NEGR1 0.2303 -0.0728 #DE8C00 0.7376 3
PCDH19 -0.0972 0.2203 #F17D50 0.7372 1
HPCA -0.0751 0.2117 #FF61C7 0.7372 Others
LRRC63 0.0787 -0.5722 #00BE71 0.7355 Others
SGIP1 0.2962 0.3838 #B79F00 0.7350 Others
CDH19 -0.1403 -0.2919 #00BFC3 0.7347 Others
HERC2 0.3163 0.3738 #FC7181 0.7342 Others
YWHAB 0.4306 0.6234 #519FFF 0.7341 Others
PRELP 0.2110 0.5125 #89AC00 0.7338 Others
DTD1 -0.0972 -0.2897 #00BCD6 0.7329 Others
CPO -0.0223 0.1211 #FB727C 0.7328 Others
HUWE1 0.3412 0.2418 #A68AFF 0.7323 Others
ABCA8 0.0524 -0.2919 #00BFC3 0.7323 Others
PLD1 -0.1953 -0.2919 #00BFC3 0.7320 Others





Negative samples distribution


The objective of this model is to identify candidate SFARI genes. For this, we are going to focus on the negative samples (the non-SFARI genes)

# Keep only the non-SFARI genes: these are the candidate pool for new findings
negative_set = test_set %>% filter(!SFARI)

# Tabulate how many of them the model flags as ASD-related
negative_set_table = apply_labels(negative_set,
                                  prob = 'Assigned Probability',
                                  pred = 'Label Prediction')

cro(negative_set_table$pred)
 #Total 
 Label Prediction 
   FALSE  10100
   TRUE  2452
   #Total cases  12552

2452 genes are predicted as ASD-related


# Density of the model scores for non-SFARI genes; dotted line marks the 0.5
# decision threshold
ggplot(negative_set, aes(prob)) +
  geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
  geom_vline(xintercept=0.5, color='#333333', linetype='dotted') +
  xlab('Probability') +
  ggtitle('Probability distribution of the Negative samples in the Test Set') +
  theme_minimal()




Probability and Gene Significance


  • There’s a lot of noise, but the probability the model assigns to each gene seems to have a positive relation with the Gene Significance (under-expressed genes having on average the lower probabilities and over-expressed genes the highest) (this pattern was the opposite in Gandal’s dataset and the same in Gupta’s)
# Per-gene model probability vs Gene Significance, coloured by the gene's
# Module-Diagnosis correlation
ggplot(negative_set, aes(prob, GS, color = MTcor)) +
  geom_point() +
  geom_smooth(method = 'loess', color = '#666666') +
  geom_hline(yintercept = 0, color='gray', linetype='dashed') +
  xlab('Probability') + ylab('Gene Significance') +
  scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
  ggtitle('Relation between Probability and Gene Significance') +
  theme_minimal()




Probability and Module-Diagnosis correlation


  • There’s not a strong relation between the Module-Diagnosis correlation of the genes assigned module and the probability assigned by the model

  • The model seems to assign higher probabilities to genes belonging to modules with high positive module-Diagnosis correlations than to genes belonging to modules with low correlations, and assigns the lowest probabilities to genes in modules with high negative module-Diagnosis correlation

# Per-gene probability vs the correlation of the gene's module to diagnosis,
# coloured by Gene Significance; dashed line = overall mean probability
ggplot(negative_set, aes(MTcor, prob, color=GS)) +
  geom_point() +
  geom_smooth(method='loess', color='#666666') +
  geom_hline(yintercept=mean(negative_set$prob), color='gray', linetype='dashed') +
  scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) +
  xlab('Modules ordered by their correlation to ASD') + ylab('Model probability') +
  theme_minimal()




Summarised version, plotting by module instead of by gene


The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same

Again, the model seems to give lower probabilities to genes belonging to modules with a low negative correlation to Diagnosis than to the rest

# Summarise the negative set per module: mean/sd of the probability and size.
# NOTE(review): grouping and joining by MTcor assumes each module has a unique
# correlation value; distinct() below cleans up the fan-out from the join
plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
            mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>% 
            left_join(original_dataset, by='MTcor') %>%
            dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>% distinct()
# First column is Module (from the select above); rename it for the id aesthetic
colnames(plot_data)[1] = 'ID'

ggplotly(plot_data %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID), alpha=0.7) + 
         geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) + 
         xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Module') + theme_minimal())




Probability and level of expression


# Gupta dataset
# NOTE(review): this loads the same './../Data/preprocessed_data.RData' path
# used for the Gandal data earlier — confirm the comment and/or path are right
load('./../Data/preprocessed_data.RData')
# Index expression and DE results by Ensembl gene ID
datExpr = datExpr %>% data.frame
rownames(datExpr) = datGenes$ensembl_gene_id
DE_info = DE_info %>% data.frame
rownames(DE_info) = datGenes$ensembl_gene_id
# Use the sample title as its ID
datMeta = datMeta %>% mutate(ID = title)

The relation between mean level of expression of the genes and the probability assigned by the model is weaker than in Gandal’s dataset, but it’s still there

# Mean and standard deviation of expression per gene
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))

plot_data = negative_set %>% left_join(mean_and_sd, by='ID') %>% 
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>% 
                      dplyr::select(ID, Module), by='ID')
# NOTE(review): renames the LAST column to 'Module' — assumes the joined Module
# column landed there and was not suffixed (e.g. Module.y); verify
colnames(plot_data)[ncol(plot_data)] = 'Module'

# Mean expression vs model probability, per gene, with loess and linear fits
plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.2, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) +  
              geom_smooth(method='lm', color='#808080', se=FALSE, linetype = 'dashed') + 
              xlab('Mean Expression') + ylab('Probability') + 
              ggtitle('Mean expression vs model probability by gene') +
              theme_minimal()

rm(mean_and_sd)

Grouping the genes by module we can see a relatively strong and linear relation between mean level of expression and model probability

# Collapse to one observation per module: mean expression, mean probability
# and module size
plot_data2 = plot_data %>%
             group_by(Module) %>%
             summarise(meanExpr = mean(meanExpr),
                       meanProb = mean(prob),
                       n = n())

# Each point is a module drawn in its own colour and sized by gene count
ggplotly(plot_data2 %>% ggplot(aes(meanExpr, meanProb, size=n)) +
         geom_point(color=plot_data2$Module, alpha = 0.6) +
         geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) +
         geom_smooth(method='lm', color='#808080', se=FALSE, linetype = 'dashed') +
         xlab('Mean Level of Expression') + ylab('Average Model Probability') +
         theme_minimal() + theme(legend.position='none') +
         ggtitle('Mean expression vs model probability by Module'))
rm(plot_data2)




Probability and LFC


There is a relation between probability and LFC, so it IS capturing a bit of true information (because LFC and mean expression were negatively correlated and it still has a positive relation in the model)

  • The relation is stronger in over-expressed in ASD (opposite to the behaviour found in Gandal’s dataset)
# Attach differential-expression results and standardise the column names
plot_data = negative_set %>% left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID') %>%
            dplyr::rename('log2FoldChange' = logFC, 'padj' = adj.P.Val)

# LFC vs model probability. The original called xlab('LFC') twice in this
# chain; ggplot keeps only the last xlab, so dropping the duplicate leaves the
# plot unchanged.
plot_data %>% ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) +
              xlab('LFC') + ylab('Probability') +
              theme_minimal() + ggtitle('LFC vs model probability by gene')



Conclusion


The model is capturing the mean level of expression of the genes (indirectly through module membership), which is a strong bias found in the SFARI scores, but it seems to be capturing a bit of true biological signal as well (based on the GS and the log fold change plot)


Saving results

# Attach gene symbols to the test-set predictions before saving.
# NOTE(review): this overwrites the full `predictions` object with the
# test-set-only version — confirm that is intentional. Also `fit` is not
# defined anywhere in this section; verify it exists at this point in the
# full document, otherwise save() will fail
predictions = test_set %>% left_join(gene_names, by = c('ID' = 'ensembl_gene_id'))

save(predictions, dataset, fit, file='./../Data/Ridge_model.RData')




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] DMwR_0.4.1         doParallel_1.0.15  iterators_1.0.12   foreach_1.5.0     
##  [5] kableExtra_1.1.0   expss_0.10.2       corrplot_0.84      MLmetrics_1.1.1   
##  [9] car_3.0-7          carData_3.0-3      ROCR_1.0-7         gplots_3.0.3      
## [13] caret_6.0-86       lattice_0.20-41    polycor_0.7-10     biomaRt_2.40.5    
## [17] ggpubr_0.2.5       magrittr_1.5       RColorBrewer_1.1-2 gridExtra_2.3     
## [21] viridis_0.5.1      viridisLite_0.3.0  plotly_4.9.2       knitr_1.28        
## [25] forcats_0.5.0      stringr_1.4.0      dplyr_1.0.0        purrr_0.3.4       
## [29] readr_1.3.1        tidyr_1.1.0        tibble_3.0.1       ggplot2_3.3.2     
## [33] tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1         backports_1.1.8      plyr_1.8.6          
##   [4] lazyeval_0.2.2       splines_3.6.3        crosstalk_1.1.0.1   
##   [7] digest_0.6.25        htmltools_0.4.0      gdata_2.18.0        
##  [10] fansi_0.4.1          checkmate_2.0.0      memoise_1.1.0       
##  [13] openxlsx_4.1.4       recipes_0.1.10       modelr_0.1.6        
##  [16] gower_0.2.1          matrixStats_0.56.0   xts_0.12-0          
##  [19] prettyunits_1.1.1    colorspace_1.4-1     blob_1.2.1          
##  [22] rvest_0.3.5          haven_2.2.0          xfun_0.12           
##  [25] crayon_1.3.4         RCurl_1.98-1.2       jsonlite_1.7.0      
##  [28] zoo_1.8-8            survival_3.1-12      glue_1.4.1          
##  [31] gtable_0.3.0         ipred_0.9-9          webshot_0.5.2       
##  [34] shape_1.4.4          quantmod_0.4.17      BiocGenerics_0.30.0 
##  [37] abind_1.4-5          scales_1.1.1         DBI_1.1.0           
##  [40] miniUI_0.1.1.1       Rcpp_1.0.4.6         xtable_1.8-4        
##  [43] progress_1.2.2       htmlTable_1.13.3     foreign_0.8-76      
##  [46] bit_1.1-15.2         stats4_3.6.3         lava_1.6.7          
##  [49] prodlim_2019.11.13   glmnet_3.0-2         htmlwidgets_1.5.1   
##  [52] httr_1.4.1           ellipsis_0.3.1       pkgconfig_2.0.3     
##  [55] XML_3.99-0.3         farver_2.0.3         nnet_7.3-14         
##  [58] dbplyr_1.4.2         later_1.0.0          tidyselect_1.1.0    
##  [61] labeling_0.3         rlang_0.4.6          reshape2_1.4.4      
##  [64] AnnotationDbi_1.46.1 munsell_0.5.0        cellranger_1.1.0    
##  [67] tools_3.6.3          cli_2.0.2            generics_0.0.2      
##  [70] RSQLite_2.2.0        broom_0.5.5          fastmap_1.0.1       
##  [73] evaluate_0.14        yaml_2.2.1           ModelMetrics_1.2.2.2
##  [76] bit64_0.9-7          fs_1.4.0             zip_2.0.4           
##  [79] caTools_1.18.0       nlme_3.1-147         mime_0.9            
##  [82] ggExtra_0.9          xml2_1.2.5           compiler_3.6.3      
##  [85] rstudioapi_0.11      curl_4.3             ggsignif_0.6.0      
##  [88] reprex_0.3.0         stringi_1.4.6        highr_0.8           
##  [91] Matrix_1.2-18        vctrs_0.3.1          pillar_1.4.4        
##  [94] lifecycle_0.2.0      data.table_1.12.8    bitops_1.0-6        
##  [97] httpuv_1.5.2         R6_2.4.1             promises_1.1.0      
## [100] KernSmooth_2.23-17   rio_0.5.16           IRanges_2.18.3      
## [103] codetools_0.2-16     MASS_7.3-51.6        gtools_3.8.2        
## [106] assertthat_0.2.1     withr_2.2.0          S4Vectors_0.22.1    
## [109] mgcv_1.8-31          hms_0.5.3            rpart_4.1-15        
## [112] timeDate_3043.102    class_7.3-17         rmarkdown_2.1       
## [115] TTR_0.23-6           pROC_1.16.2          shiny_1.4.0.2       
## [118] Biobase_2.44.0       lubridate_1.7.4